Shrinkage

Model comparison (done correctly) helps to choose the model that provides a good representation of the true DGP by penalizing models that overfit. This penalization is achieved mainly by assessing “fit” not on a training data set, but on a hold out test data set.

A complementary approach to work against overfitting is to specify priors that shrink model coefficients towards zero. Such shrinkage priors are typically

Here relative refers to the scale on which a predictor is measured.

To visualize how shrinkage works, we generate a small data set for which we want to estimate the mean:

set.seed(5)
y = (rnorm(15) %>% scale()) + 1
hist(y)

We want a Bayesian estimate of the mean of y: \(\small y \sim normal(mu, 1)\) with two different priors \(\small mu \sim normal(0,\sigma_{wide})\) and \(\small mu \sim normal(0,\sigma_{shrink})\) for mu:

par(mar = c(5,3,.5,.1))
curve(dnorm(x,0,1), -10, 10, n = 500,
      ylab = "density", xlab = "mu", col = "blue", lwd = 3)
curve(dnorm(x,0,3), n = 500, add = T,
      ylab = "density", xlab = "mu", col = adjustcolor("blue",alpha = .5), lwd = 3)
legend("topleft",
       lty = 1, lwd = 3,
       col = c("blue",adjustcolor("blue",alpha = .5)),
       legend = c("mu ~ normal(0,1)","mu ~ normal(0,3)"),
       bty = "n")

Here, we calculate the log-likelihood and the prior probability of different estimates for the mean of y

mus = seq(-1,3,.01)
P = 
  apply(
    data.frame(mu = mus), 1, 
    function(mu)  {
      c(
        mu = mu,
        # P(data| mu)
        llikelihood = sum(dnorm(y,mean = mu, log = T)), 
        # P(mu), mu ~ normal(0,1)
        Psd1 = dnorm(mu,0, sd = 1, log = T),
         # P(mu), mu ~ normal(0,3)
        Psd3 = dnorm(mu,0, sd = 3, log = T)
      )
    }
  )
P[,1:5]
##                   [,1]       [,2]       [,3]       [,4]       [,5]
## mu.mu        -1.000000  -0.990000  -0.980000  -0.970000  -0.960000
## llikelihood -50.784078 -50.484828 -50.187078 -49.890828 -49.596078
## Psd1.mu      -1.418939  -1.408989  -1.399139  -1.389389  -1.379739
## Psd3.mu      -2.073106  -2.072001  -2.070906  -2.069823  -2.068751

Next, we calculate the posterior probabilities:

post1 = exp(P["llikelihood",] + P["Psd1.mu",])
post3 = exp(P["llikelihood",] + P["Psd3.mu",])

Now we can show likelihood, prior and posterior together:

par(mfrow = c(3,1), mar=c(2.75,2.75,0,.25), mgp=c(1.25,.1,0), tck=-.01)

curve(dnorm(x,0,1), min(mus), max(mus), n = 500,
      ylab = "P(mu)", xlab = "", col = "blue", lwd = 3)
curve(dnorm(x,0,3), n = 500, add = T,
      ylab = "density", xlab = "", col = adjustcolor("blue",alpha = .5), lwd = 3)
legend("topright",
       lty = 1, lwd = 3,
       col = c("blue",adjustcolor("blue",alpha = .5)),
       legend = c("mu ~ normal(0,1)","mu ~ normal(0,3)"),
       bty = "n")
mtext("Prior", line = -2, adj = .05)

plot(mus,exp(P["llikelihood",]),'l', xlab = "", ylab = "P(y|mu)", col = "red", lwd = 2)
mtext("Likelihood", line = -2, adj = .05)

plot(mus,post1,'l', xlab = "mu", ylab = "P(y|mu) * P(mu)", col = "purple", lwd = 2)
lines(mus,post3,'l', col = adjustcolor("purple",alpha = .5), lwd = 2)
abline(v = c(mus[which.max(post1)],mus[which.max(post3)]), lty = 3,
       col = c("purple",adjustcolor("purple",alpha = .5)))
legend("topright",
       lty = 1, lwd = 3,
       col = c("purple",adjustcolor("purple",alpha = .5)),
       legend = c("mu ~ normal(0,1)","mu ~ normal(0,3)"),
       bty = "n")
mtext("Posterior", line = -2, adj = .05)

We zoom in to see more clearly how the narrow prior shrinks the estimated mu towards zero

par(mar = c(5,3,.5,.1))
ids = which(mus > 0 & mus < 1.5)
plot(mus[ids],post1[ids],'l', xlab = "mu", ylab = "P(y|mu) * P(mu)", col = "purple", lwd = 2)
lines(mus[ids],post3[ids],'l', col = adjustcolor("purple",alpha = .5), lwd = 2)
abline(v = c(mus[which.max(post1)],mus[which.max(post3)]), lty = 3,
       col = c("purple",adjustcolor("purple",alpha = .5)))
arrows(x0 = mus[which.max(post3)],
      x1 = mus[which.max(post1)],
      y0 = mean(c(max(post1),max(post3))),
      lwd = 3, length = .15)
legend("topright",
       lty = 1, lwd = 3,
       col = c("purple",adjustcolor("purple",alpha = .5)),
       legend = c("mu ~ normal(0,1)","mu ~ normal(0,3)"),
       bty = "n")

An example

To show that shrinkage works, we estimate spline models with different standard deviations on regression coefficients for the simulated income / well being data above.

N = 1000
set.seed(123456)
x = sort(rnorm(N,mean = 0))
bf.s = splines::bs(x,df = 3)
bf.b = matrix(c(1,1,1),ncol = 1)
mu = bf.s %*% bf.b
y = rnorm(N, mu , sd = .1)
par(mfrow = c(1,1))
plot(x,y)
lines(x,mu, col = "red", lwd = 2)

The following figure shows the estimated relationships for different samples drawn from the population.

Hopefully you can see that large deviations between the true DGP in red and the estimated DGP in blue are less frequent when the prior on regression coefficients is narrow (top left) compared to when it is wider (bottom right).

To confirm this, the following plot shows deviances from a simulation that

  • samples 1000 times 75 training and 75 test data points
  • estimates model parameters on the training set
  • calculates deviances for the training and test data sets.

The following figure shows deviances +/- 1 sd.

load("sim_lppd.Rdata")

plot_deviances = function() {
  D.test = -2*elpd.test
D.train = -2*elpd.train
m.test = colMeans(D.test)
m.train = colMeans(D.train)

within_sd = function(m) {
  apply(apply(m,2, function(x) x - rowMeans(m)), 2, sd)
}

lower.test = m.test - within_sd(elpd.test)
upper.test = m.test +  within_sd(elpd.test)
lower.train = m.train - within_sd(elpd.train)
upper.train = m.train + within_sd(elpd.train)

par(mfrow = c(2,1), mar=c(2.5,2.5,.5,.5), mgp=c(1,.1,0), tck=-.01)
ylim = range(c(lower.train, upper.train))
xs = 1:ncol(elpd.test)
plot(xs,m.train, ylim = ylim, main = "Training data",
     ylab = "Deviance", xlab = "", xaxt = "n")
axis(1, at = 1:5, labels = c(1,2,5,10,20))
arrows(xs,y0 = lower.train, y1 = upper.train, length = 0)


ylim = range(c(lower.test, upper.test))
plot(xs+.2, m.test, col = "blue", pch = 16, ylim = ylim, main = "Test data",
      ylab = "Deviance", xlab = "sd(b)", xaxt = "n")
arrows(xs+.2,y0 = lower.test, y1 = upper.test, length = 0, col = "blue")
axis(1, at = 1:6, labels = b.sds)
}

plot_deviances()

Indeed, we see that while models with wider priors on the regression coefficients have a lower deviances in the training data, they have larger deviances for the test data.

This becomes also clear, if we plot many estimated DGPs together. In the following figure, each blue lines show the expected y values inferred from 100 random samples and the red lines the true relationship between x an y.

Recap last week: From entropy to lppd

\[ H(p) = \sum_{i=1}^{n} p_i log(p_i) \]

\[ D_{KL}(p,q) = \sum_i p_i \,log(p_i) - p_i \, log(q_i) \]

\[ lppd = \sum_{i=1}^{n} log(p(y|theta)) \]

Cross validation

Cross validation is a general term that refers to a situation where model performance is used on a different data set than used for parameter estimation.

So far we have implemented some simple cross validation manually, by simply simply splitting our total sample in half. However, data can also be split differently, e.g. 20% training data 80% test data or the other way around.

One popular way to split data to fit the data on \(\small N-1\) data points and to use the \(\small N\)th data point as test data. This is referred to as Leave one out cross validation or LOO-CV.

One thing that is easily implemented with LOO-CV is to calculated the deviance as the average deviance over the \(\small N\) LOO-CV deviances. for LOO-CV, there are always only N-1 training data sets.

This is different for alternative schemes. For instance, at a samples size of 20, there are 1.84756^{5} to construct the training data set. Here, averaging about all possible test-data deviances would be too computationally expensive.

The key thing to keep in mind when doing cross validation is that dependent observations (more specifically, observations with correlated errors) should always be kept in either test or training data sets. Such complications plays e.g. a role in time series or hierarchically organized data sets.

LOO-CV and PSIS-LOO

The computationally challenging part of LOO-CV is that one needs to estimate the model \(\small N-1\) times.

Fortunately, one can approximate LOO-CV with a method called Pareto smoothed importance sampling (PSIS-LOO). Here, not all \(\small lppd\) receive the same weight when calculating the deviance, but

  • \(\small lppd\) values of observations that have a strong influence receive a higher weight and
  • Pareto-smoothing (of the tails of the \(\small lppd\) distribution) is used to get better weights.

Information criteria

The goal of information criteria is to give us the information out of sample prediction (cross validation) gives us, without that we need to split the data into test and training set.

Information criteria are calculated from two quantities

All well known information criteria, like AIC, and BIC and maybe less well know like DIC, WAIC use the same deviance term, but they differ in their penalty:

Never heard of WAIC, why should you used it?

Are model comparison criteria any good?

The benchmark is out of sample prediction or cross validation, where we

  • draw first a training sample from the DGP,
  • estimate model parameters,
  • draw a test sample (of same size) and
  • estimate the out of sample deviance as our parameter of interest.

In the following figure from the book, these cross validation values are represented by dots (empty for flat priors black for shrinkage priors).

Lines in the figures are average values over 1000 simulations and show that on average information criteria, (LOO)-CV, PSIS-LOO and WAIC do a good job of approximating full cross validation.

When looking at averages, positive and negative differences to full cross validation can cancel. Therefore, it is good to also look at the average absolute, i.e. for each simulated data set we calculate the absolute of the differences between cross validation llpd and the approximation. This is shown in the following figure:

Bottom line: PSIS-LOO and WAIC are relatively close to full cross validation and can be computed relatively cheaply. In practice, it is good to use both and take discrepant results are a warning signal.

Model comparison

Information criteria are often used for model selection, but this is not the only way and should not even be the most important way to use the them.

More generally, we can think of two goals of model comparison:

Model comparison should not be used for covariate selection when we want to estimate causal effects. When our interest is in causal effects, covariate selection should be determined based on the DAG1.

When model selection leads astray

To illustrate why model comparison is not useful for covariate selection, we can go back to the example about the effect of treatment on well being:

source("../Chapter6/dag_utils.R")
par(mfrow = c(1,2))
M_dag = dagitty(
  "dag{
  M->W;
  T->M;
  T->W
  }")
coord.list = 
  list(
    x=c(T=0,W=2,M=1),
    y=c(T=0,W=0,M=-1))
coordinates(M_dag) = coord.list
drawmydag(M_dag,cex = 2)

N = 150
set.seed(1)
T = rnorm(N)
M = T + rnorm(N)
W = 0.1*T + 0.5*M + rnorm(N)
grayblue = colorRampPalette(c("grey","blue"))
par(mar=c(2.5,2.5,.5,.5), mgp=c(1,.1,0), tck=-.01)
plot(W~T, col = grayblue(N)[rank(M)], pch = 16)
legend("topleft",pch = 16, col = c("gray","blue"), 
       legend = c("low M", "high M"), bty = "n")

Now we can estimate different models that use combinations of T and M to predict W.

dt = list(T = T, M = M, W = W)

# adjustment for treatment (incorrect model)
W.TM = quap(
  alist(
    W ~ dnorm(mu, sigma),
    mu <- a + bT*T + bM*M,
    a ~ dnorm(0,1),
    bT ~ dnorm(0,1),
    bM ~ dnorm(0,1),
    sigma ~ dexp(1)), 
  data = dt)

# no adjustment for treatment (correct model)
W.T = quap(
  alist(
    W ~ dnorm(mu, sigma),
    mu <- a + bT*T,
    a ~ dnorm(0,1),
    bT ~ dnorm(0,1),
    sigma ~ dexp(1)), 
  data = dt)

# effect of mediator (incorrect model)
W.M = quap(
  alist(
    W ~ dnorm(mu, sigma),
    mu <- a + bM*M,
    a ~ dnorm(0,1),
    bM ~ dnorm(0,1),
    sigma ~ dexp(1)), 
  data = dt)

# no effect (incorrect model)
W.a = quap(
  alist(
    W ~ dnorm(mu, sigma),
    mu <- a,
    a ~ dnorm(0,1),
    sigma ~ dexp(1)), 
  data = dt)

To see the WAIC for the model W.TM we use the WAIC function from the rethinking package.

WAIC(W.TM) %>% round(1)
##    WAIC   lppd penalty std_err
## 1 441.1 -216.4     4.2    17.4

To compare the models, we use the compare function:

compare(W.TM, W.T, W.M, W.a) %>%  round(1)
##       WAIC   SE dWAIC  dSE pWAIC weight
## W.M  439.8 17.5   0.0   NA   3.2    0.7
## W.TM 441.2 17.6   1.4  1.7   4.3    0.3
## W.T  473.2 17.0  33.4 12.3   3.4    0.0
## W.a  499.8 19.3  60.0 16.1   2.3    0.0

Here is an explanation of the columns:

  • WAIC: WAIC value for each model
  • SE: Standard error of the WAIC (information criterion value to the left)
  • dWAIC: WAIC difference between the best model and the row model
  • dSE: Standard error of the WAIC difference
  • pWAIC: Penalty term (approximate number of effective parameters)
  • weight: Model weight for an ensemble prediction.

We see that in a scenario where a good part of the treatment effect was mediated by motivation (the DPG is M = T + rnorm(N), W = 0.125*T + 0.5*M + rnorm(N) ) the model we should use to estimate the treatment effect is not the best model, and even worse than a model that only uses the mediator.

One important benefit of WAIC (and PSIS-LOO) is that they come with standard error, so that we can compare differences in WAIC with our uncertainty of the difference. Looking at the two best models, we see that W.M is around 0.9 WIC better than W.TM and that the standard error of this difference is 1.8. Given that WAIC difference is firmly within two times dSE we cannot conclude that the W.M model is certainly better than the W.TM model. In comparison, the W.M model is certainly better than the W.T model with a WIC differences of 33.1 and a standard error of NA.

This more easily seen in a plot:

plot(compare(W.TM, W.T, W.M, W.a))
legend("topright",
       pch = c(16,1,2),
       legend = c("sample deviance",
                  "WAIC +/- 2*SE",
                  "WAIC +/- 2*dSE"),
       bty = "n")

What this plot also shows is that if we wanted to compared models W.T and W.a to decide if treatment was effective, the standard errors of the difference show us that we would not be very certain that the true model is better, even though the treatment effect is clearly different from zero:

precis(W.T) %>% round(2)
##       mean   sd  5.5% 94.5%
## a     0.02 0.09 -0.13  0.17
## bT    0.58 0.10  0.41  0.74
## sigma 1.14 0.07  1.04  1.25

One intuition for this is that if the data are relatively noisy, model comparison techniques will have difficulties to be sure if one model is clearly better then others, even if within one model the effect of treatment is clear. Model comparison techniques only care about out of sample predictions and penalize for additional parameters. If due to noisy data adding a causal variable does not improve prediction by much, this model will not clearly outperform simpler models.

Functional form and robust regression

To correctly estimate effects it is not sufficient to choose the correct adjustment variables, but the functional form should also be (approximately?) correct.

Functional form refers to the form of the function that describes a relationship. The default for gaussian regression models is a linear relationship, but we have seen that relation ships can be non-linear.

Assume that we have collected the data in the following plot.

N = 500
set.seed(2)
x = sort(rnorm(N,mean = 0))
bf.s = splines::bs(x,df = 3)
bf.b = matrix(c(1,1,.8),ncol = 1)
mu = bf.s %*% bf.b
nn = 75
sidx = sample(N,nn)
y.n = rnorm(nn, mu[sidx] , sd = .1)
y = rstudent(nn, nu = 3, mu = mu[sidx], sigma = .1)
par(mfrow = c(1,1))
plot(x[sidx],y, xlab = "x", pch = 16)

#points(x[sidx],y.n, col = adjustcolor("black",alpha = .25))
#lines(x,mu, col = "red", lwd = 2)
x = x[sidx]

Now we want to know if the relationship between x and y is linear or non linear.

To do this, we fit a series of models of increasing complexity. To simplify this, we set up a basic model and define design matrices for different models:

quap.spline.model = 
  alist(
    y ~ dnorm(mu,sigma),
    mu <- X %*% b ,
    b ~ dnorm(0,1),
    sigma ~ dnorm(.25,.1)
  )

library(splines)
# linear regression
LM = model.matrix(~ 0 + x) %>% cbind(1)
# splines with different numbers of knots
B3 = bs(x, df = 3) %>% cbind(1)
B6 = bs(x, df = 6) %>% cbind(1)
B18 = bs(x, df = 18) %>% cbind(1)

Before we fit the models, lets look at prior predictions for mu:

pp = function(X) {
  o = order(x)
  b = matrix(rnorm(ncol(X)*55, sd = 1),ncol = 55)
  yhat = X %*% b
  matplot(x[o],yhat[o,],type = 'l', col = "blue", lty = 1, xlab = "x", ylab = "x")
}

par(mfrow = c(2,2), mar=c(2.5,2.5,1,.5), mgp=c(1,.1,0), tck=-.01)
pp(LM); title("linear model")
pp(B3); title("splines with 3 knots")
pp(B6); title("splines with 6 knots")
pp(B18); title("splines with 18 knots")

With the basic model with have set up, we can now easily fit the linear regression and the spline models.

q.B3 = 
  quap(quap.spline.model,
       data = list(X = B3, y = y, x = x),
       start = list(b=rep(0, ncol(B3)), sigma = .1))
q.B6 = 
  quap(quap.spline.model,
       data = list(X = B6, y = y, x = x),
       start = list(b=rep(0, ncol(B6)), sigma = .1))
q.B18 = 
  quap(quap.spline.model,
       data = list(X = B18, y = y, x = x),
       start = list(b=rep(0, ncol(B18)), sigma = .1))
q.LM = 
  quap(quap.spline.model,
       data = list(X = LM, y = y, x = x),
       start = list(b=rep(0, ncol(LM)), sigma = .1))

# work around a bug
attr(q.B3,"nobs") = 50
attr(q.B6,"nobs") = 50
attr(q.B18,"nobs") = 50
attr(q.LM,"nobs") = 50

And we plot the model predictions:

par(mfrow = c(1,1), mar=c(3,3,.5,.5), mgp=c(1.25,.1,0), tck=-.01)
plot_preds = function(q.fit, add = FALSE, col = "blue") {
  mu.mat = link(q.fit,n = 1e4)
  if(is.list(mu.mat)) mu.mat = mu.mat[["mu"]]
  mu = apply(mu.mat,2,mean)
  mu.ci = apply(mu.mat,2,function(x) quantile(x,c(.05,.95)))
  x = q.fit@data[["x"]]
  o = order(x)
  if (add == FALSE) {
    plot(x,y, ylim = range(mu.ci))
  }
  lines(x[o],mu[o],'l', col = col)
  shade(mu.ci[,o],x[o], col = adjustcolor(col,alpha = .25))
}

plot(x,y, ylim = c(0,1.35), pch = 16)
plot_preds(q.LM, add = TRUE, "orange")
plot_preds(q.B3, add = TRUE, "blue")
plot_preds(q.B6, add = TRUE, "purple")
plot_preds(q.B18, add = TRUE, "violet")

legend("topleft",
       fill = c("orange","blue","purple","violet"),
       legend = c("linear model", paste(c(3,6,18),"knots")),
       bty = "n")

Unsurprisingly, more flexible models fit the data better. This can be seen if we use compare with WAIC:

compare(q.B3,q.B6,q.B18,q.LM, func = "WAIC") %>% round(2)
##         WAIC    SE dWAIC   dSE pWAIC weight
## q.B3  -61.03 19.81  0.00    NA  5.95   0.84
## q.B6  -57.75 23.74  3.28  6.31  9.60   0.16
## q.LM  -43.90 19.77 17.13 10.82  5.69   0.00
## q.B18 -41.79 22.10 19.24  8.51 22.65   0.00

Now we use PSIS-LOO to compare the models:

compare(q.B3,q.B6,q.B18,q.LM, func = "PSIS", n = 1e5) %>% round(2)
## Some Pareto k values are high (>0.5). Set pointwise=TRUE to inspect individual points.
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
## Some Pareto k values are high (>0.5). Set pointwise=TRUE to inspect individual points.
##         PSIS    SE dPSIS   dSE pPSIS weight
## q.B3  -58.86 20.72  0.00    NA  7.11   0.97
## q.B6  -51.74 26.44  7.12  8.18 12.63   0.03
## q.LM  -42.65 20.56 16.21 10.99  6.35   0.00
## q.B18 -28.10 26.39 30.75 11.48 29.58   0.00

This warning tells us that there are some very influential data points that determine to a large degree the PSIS-LOO values. Even if we are tempted to compare models now, there is really no point because we can’t trust the PSIS-LOO values.

To improve the situation, we investigate which points are influential. To do this, we can use PSIS with the flag pointwise == TRUE to get the parameter k for each data point. “k” is the shape parameter of the Pareto distribution, and values larger .7 / 1 indicate problems.

plot_influencial = function(q.fit, ylim = NULL, xlim = NULL) {
  plot(x,y, pch = 16, ylim = ylim, xlim = xlim)
  plot_preds(q.fit, add = TRUE, "blue")
  PSISLOO = PSIS(q.fit, pointwise = TRUE,n = 5e4)
  points(x[which(PSISLOO$k > .5)],y[which(PSISLOO$k > .5)], pch = 16, col = "orange")
  points(x[which(PSISLOO$k > 1)],y[which(PSISLOO$k > 1)], pch = 16, col = "red")
  text(2,.2, paste("sigma = ",round(coef(q.fit)["sigma"],3)), pos = 2)
  legend("topleft", pch = 16,
         col = c("orange","red"),
         legend = c("k > .5", "k > 1"), bty = "n")
}

par(mfrow = c(2,2),  mar=c(3,3,1,.5), mgp=c(1.25,.1,0), tck=-.01)
plot_influencial(q.LM); title("linear model")
## Some Pareto k values are high (>0.5). Set pointwise=TRUE to inspect individual points.
plot_influencial(q.B3); title("splines, 3 nodes")
## Some Pareto k values are high (>0.5). Set pointwise=TRUE to inspect individual points.
plot_influencial(q.B6); title("splines, 6 nodes")
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
plot_influencial(q.B18); title("splines, 18 nodes")
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.

Note that more complex models have a higher number of influential data points. The intuition here is that the flexibility of muallows the error variance to be smaller. The price one pays is that data points there are far away from mu become very influential because the density fall off quickly.

The PSIS warnings are another instance of the folk theorem of statistical modelling which states that modeling and estimation problems are often a result of mis-specified statistical models.

The figure makes clear that the model does a bad job dealing with outliers. One method to deal with outliers and that comes under the name “robust regression” uses the student t distribution as an error model (instead of the normal distribution.):

curve(dnorm(x, mean = , sd = 1), -4, 4, n = 500, ylab = "density", lty = 3)
curve(dstudent(x, nu = 2, mu = 0, sigma = 1), -4, 4, n = 500,  add = TRUE)
legend("topleft", lty = c(1,3),
       legend = c("dnorm(0,1)","dstudent(0,2,1)"),
       bty = "n")

Because the density of the student t distribution is lower close to zero and higher longer away from zero, observed values that are far away from mu will have a larger likelihood and will this be less influential.

The next figure shows that by using a student t error distribution, we can increase the likelihood of values far away from mu, decrease the likelihood of values close to mu and thus reduce the relative importance of the outlier values.

plot_influencial(q.B6, xlim = c(min(q.B6@data$x),2.5), ylim = c(-.2,1.5))
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
yhat = colMeans(link(q.B6,n = 1e4))
sigma = coef(q.B6)["sigma"]
PSISLOO = PSIS(q.B6, pointwise = TRUE, n = 5e4)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
xs = seq(-1,1,.01)
d = dnorm(xs,0,sigma)/4
ds = dstudent(xs,0,nu = 2, sigma)/4

for (j in which(PSISLOO$k > .5)) {
  lines(d+x[j],xs+yhat[j], lty = 3)
  lines(ds+x[j],xs+yhat[j])
  lines(x = rep(x[j],2), y = c(min(xs), max(xs))+yhat[j], col = "grey")
}

Now lets re-estimate the data with a student-t error distribution. Here is the basic model.

quap.spline.model = 
  alist(
    y ~ dstudent(3, mu, sigma),
    mu <- X %*% b ,
    b ~ dnorm(0,1),
    sigma ~ dnorm(.25,.1)
  )
q.B3r = 
  quap(quap.spline.model,
       data = list(X = B3, y = y, x = x),
       start = list(b=rep(0, ncol(B3)), sigma = .1))
q.B6r = 
  quap(quap.spline.model,
       data = list(X = B6, y = y, x = x),
       start = list(b=rep(0, ncol(B6)), sigma = .1))
q.B18r = 
  quap(quap.spline.model,
       data = list(X = B18, y = y, x = x),
       start = list(b=rep(0, ncol(B18)), sigma = .1))
q.LMr = 
  quap(quap.spline.model,
       data = list(X = LM, y = y, x = x),
       start = list(b=rep(0, ncol(LM)), sigma = .1))

# work around a bug
attr(q.B3r,"nobs") = 50
attr(q.B6r,"nobs") = 50
attr(q.B18r,"nobs") = 50
attr(q.LMr,"nobs") = 50

And we compare the models again.

compare(q.B3r,q.B6r,q.B18r,q.LMr, func = "PSIS", n = 25e3) %>% round(2)
##          PSIS    SE dPSIS   dSE pPSIS weight
## q.B6r  -81.36 20.65  0.00    NA  6.96   0.91
## q.B3r  -76.80 19.60  4.55  5.57  4.45   0.09
## q.B18r -66.86 20.64 14.50  7.04 17.82   0.00
## q.LMr  -57.30 19.07 24.05 13.22  4.29   0.00

This time there are no warnings!

Here are the new models predictions:

par(mfrow = c(1,1), mar=c(3,3,.5,.5), mgp=c(1.25,.1,0), tck=-.01)
plot(x,y, ylim = c(0,1.35), pch = 16)
plot_preds(q.LMr, add = TRUE, "orange")
plot_preds(q.B3r, add = TRUE, "blue")
plot_preds(q.B6r, add = TRUE, "purple")
plot_preds(q.B18r, add = TRUE, "violet")

legend("topleft",
       fill = c("orange","blue","purple","violet"),
       legend = c("linear model", paste(c(3,6,18),"knots")),
       bty = "n")


  1. Admittedly, things can get more complicated if we are uncertain about which DAG is correct. Model selection criteria can help to select or weight DAGs↩︎